Clustering Techniques

Author
Affiliation

Manas P Panse

College of Information Science, University of Arizona

0 - Pre-Checks

# Checking Python Version
# (`!` is an IPython shell escape: the command is run by the system shell, not parsed as Python)

!python --version
Python 3.12.3
# Importing Necessary Libraries

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.cluster.hierarchy import dendrogram, linkage
from scipy.stats import zscore
from sklearn.cluster import KMeans
from sklearn.impute import SimpleImputer
from sklearn.metrics import calinski_harabasz_score
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import StandardScaler

# Plot styling: use seaborn's plain "white" style for the figures in this notebook
sns.set(style = "white")

# Dataset: read the OWID energy CSV (relative path) into a DataFrame
dataset_path = "data/hw-05/owid-energy.csv"
energy_df = pd.read_csv(dataset_path)

1 - Data Preparation

Data Overview

# Preview the first five rows to sanity-check that the CSV loaded as expected
energy_df.head()
country year iso_code population gdp biofuel_cons_change_pct biofuel_cons_change_twh biofuel_cons_per_capita biofuel_consumption biofuel_elec_per_capita ... solar_share_elec solar_share_energy wind_cons_change_pct wind_cons_change_twh wind_consumption wind_elec_per_capita wind_electricity wind_energy_per_capita wind_share_elec wind_share_energy
0 Afghanistan 1900 AFG 4832414.0 NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
1 Afghanistan 1901 AFG 4879685.0 NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
2 Afghanistan 1902 AFG 4935122.0 NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
3 Afghanistan 1903 AFG 4998861.0 NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
4 Afghanistan 1904 AFG 5063419.0 NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN

5 rows × 129 columns

# Print the frame summary: row count, column count, dtype breakdown and memory usage
energy_df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21890 entries, 0 to 21889
Columns: 129 entries, country to wind_share_energy
dtypes: float64(126), int64(1), object(2)
memory usage: 21.5+ MB

Shape

The dataset contains 21890 ROWS and 129 COLUMNS.

DataTypes

  1. int64 : 01 column.
  2. object : 02 columns.
  3. float64 : 126 columns.

Descriptive Statistics

# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns
energy_df.describe()
year population gdp biofuel_cons_change_pct biofuel_cons_change_twh biofuel_cons_per_capita biofuel_consumption biofuel_elec_per_capita biofuel_electricity biofuel_share_elec ... solar_share_elec solar_share_energy wind_cons_change_pct wind_cons_change_twh wind_consumption wind_elec_per_capita wind_electricity wind_energy_per_capita wind_share_elec wind_share_energy
count 21890.000000 1.802900e+04 1.111300e+04 862.000000 1337.000000 952.000000 1372.000000 5221.000000 5442.000000 5407.000000 ... 6871.000000 5442.000000 2295.000000 5340.000000 5445.000000 7789.000000 8676.000000 4779.000000 6871.000000 5445.000000
mean 1973.661261 1.045117e+08 3.585114e+11 34.143052 3.449835 159.620382 47.051201 65.366646 11.032376 2.030252 ... 0.580494 0.129717 274.509119 5.647746 40.340626 54.478592 14.571141 175.599518 1.358409 0.440739
std 34.960962 4.593929e+08 2.411179e+12 227.488193 11.674255 269.540042 125.445899 202.092082 46.728412 5.391375 ... 2.009426 0.475138 6084.992396 31.723429 224.719509 236.096397 86.388161 623.300059 4.443910 1.533429
min 1900.000000 1.833000e+03 1.642060e+08 -100.000000 -54.584000 0.000000 0.000000 0.000000 0.000000 0.000000 ... 0.000000 0.000000 -100.000000 -42.829000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
25% 1945.000000 1.691561e+06 1.365898e+10 0.013250 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 ... 0.000000 0.000000 4.665000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
50% 1984.000000 6.968070e+06 4.167411e+10 8.251000 0.005000 18.697000 2.704500 0.137000 0.010000 0.067000 ... 0.000000 0.000000 20.944000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
75% 2003.000000 2.538869e+07 1.744295e+11 25.328000 2.153000 238.256000 25.975500 36.301000 0.740000 1.610000 ... 0.062000 0.004000 50.000000 0.105250 0.774000 0.715000 0.059000 13.208500 0.325500 0.053000
max 2022.000000 7.909295e+09 1.136302e+14 5659.328000 136.261000 1747.467000 1139.921000 2524.931000 666.280000 71.429000 ... 40.000000 5.999000 242384.844000 679.413000 4872.095000 3219.852000 1848.260000 7361.917000 56.840000 24.614000

8 rows × 127 columns

Handling Missing Values

# Count the missing (NaN) entries in each column
energy_df.isnull().sum()
country                       0
year                          0
iso_code                   5470
population                 3861
gdp                       10777
                          ...  
wind_elec_per_capita      14101
wind_electricity          13214
wind_energy_per_capita    17111
wind_share_elec           15019
wind_share_energy         16445
Length: 129, dtype: int64

Handling Duplicate Values

# Count rows that are exact duplicates of an earlier row (all columns compared)
energy_df.duplicated().sum()
0

Column Separations for Future Use

# Dtype groups used to split the column labels of energy_df
numeric_dtypes = ['int64', 'float64']
categoric_dtypes = ['object', 'bool']

# Numerical Columns: labels of every int64 / float64 column
numeric_cols = energy_df.select_dtypes(include = numeric_dtypes).columns

# Categorical Columns: labels of every object / bool column
categoric_cols = energy_df.select_dtypes(include = categoric_dtypes).columns

Given Code

# Selecting a subset of columns that are most relevant to Energy Consumption, Production, and Environmental Impact
relevant_columns = [
    'country', 'year', 'iso_code', 'population', 'gdp',
    'biofuel_consumption', 'coal_consumption', 'gas_consumption', 'oil_consumption',
    'renewables_consumption', 'nuclear_consumption', 'fossil_fuel_consumption', 'low_carbon_consumption',
    'electricity_generation', 'primary_energy_consumption', 'carbon_intensity_elec',
    'greenhouse_gas_emissions'
]

# Creating a new DataFrame with the Relevant Columns.
# The explicit .copy() makes relevant_df an independent frame, so the assignment further down
# is unambiguous and does not trigger pandas' SettingWithCopyWarning.
relevant_df = energy_df[relevant_columns].copy()

# For simplicity, we will fill missing values in consumption and generation columns with zeros, as missing values can logically imply no consumption/production
# (population, gdp, carbon_intensity_elec and greenhouse_gas_emissions are deliberately left untouched and keep their NaNs)
consumption_generation_columns = [
    'biofuel_consumption', 'coal_consumption', 'gas_consumption', 'oil_consumption',
    'renewables_consumption', 'nuclear_consumption', 'fossil_fuel_consumption', 'low_carbon_consumption',
    'electricity_generation', 'primary_energy_consumption'
]
relevant_df.loc[:, consumption_generation_columns] = relevant_df[consumption_generation_columns].fillna(0)

# Checking if there are any object types that should be converted or other data type corrections
# Summary of the Cleaned DataFrame
relevant_df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21890 entries, 0 to 21889
Data columns (total 17 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   country                     21890 non-null  object 
 1   year                        21890 non-null  int64  
 2   iso_code                    16420 non-null  object 
 3   population                  18029 non-null  float64
 4   gdp                         11113 non-null  float64
 5   biofuel_consumption         21890 non-null  float64
 6   coal_consumption            21890 non-null  float64
 7   gas_consumption             21890 non-null  float64
 8   oil_consumption             21890 non-null  float64
 9   renewables_consumption      21890 non-null  float64
 10  nuclear_consumption         21890 non-null  float64
 11  fossil_fuel_consumption     21890 non-null  float64
 12  low_carbon_consumption      21890 non-null  float64
 13  electricity_generation      21890 non-null  float64
 14  primary_energy_consumption  21890 non-null  float64
 15  carbon_intensity_elec       5079 non-null   float64
 16  greenhouse_gas_emissions    5220 non-null   float64
dtypes: float64(14), int64(1), object(2)
memory usage: 2.8+ MB

Task 1 - Exploratory Data Analysis (2 Cr.)

Distribution of primary_energy_consumption.

# Checking Missing & Zero Values in primary_energy_consumption.
primary_energy_column = relevant_df['primary_energy_consumption']
missing_count = primary_energy_column.isnull().sum()
zero_count = (primary_energy_column == 0).sum()
print(f"Missing Values : {missing_count}")
print(f"Zero Values : {zero_count}")
Missing Values : 0
Zero Values : 9894
# Keep only strictly positive readings so the zero entries do not swamp the histogram
primary_energy_series = relevant_df['primary_energy_consumption']
filtered_data = primary_energy_series[primary_energy_series > 0]

# Plotting: histogram with a KDE overlay on a dedicated Axes
fig, ax = plt.subplots(figsize = (8, 6))
sns.histplot(filtered_data, kde = True, color = "tomato", ax = ax)
ax.set_title("Distribution of Primary Energy Consumption (Non-Zero Values)")
ax.set_xlabel("Primary Energy Consumption")
ax.set_ylabel("Frequency")
plt.show()

I don’t mean to sound unprofessional, but the above plot looks absolutely hideous. Let’s fix that with a log transformation …

# Natural-log transform of the positive-only values (zeros were already filtered out, so the log is defined everywhere)
log_consumption = np.log(filtered_data)

# Plotting Log-Transformed Values on a dedicated Axes
fig, ax = plt.subplots(figsize = (8, 6))
sns.histplot(log_consumption, kde = True, color = "tomato", ax = ax)
ax.set_title("Log-Transformed Distribution of Primary Energy Consumption (Non-Zero Values)")
ax.set_xlabel("Log of Primary Energy Consumption")
ax.set_ylabel("Frequency")
plt.show()

Distribution of electricity_generation

# Checking for Missing and Zero Values in electricity_generation
electricity_column = relevant_df['electricity_generation']
missing_count = electricity_column.isnull().sum()
zero_count = (electricity_column == 0).sum()
print(f"Missing Values : {missing_count}")
print(f"Zero Values : {zero_count}")
Missing Values : 0
Zero Values : 14964
# Keep only strictly positive readings so the zero entries do not swamp the histogram
electricity_series = relevant_df['electricity_generation']
filtered_electricity_gen = electricity_series[electricity_series > 0]

# Plotting: histogram with a KDE overlay on a dedicated Axes
fig, ax = plt.subplots(figsize = (8, 6))
sns.histplot(filtered_electricity_gen, kde = True, color = "orange", ax = ax)
ax.set_title("Distribution of Electricity Generation (Non-Zero Values)")
ax.set_xlabel("Electricity Generation")
ax.set_ylabel("Frequency")
plt.show()

And again, the hideousness continues here … let’s fix that too!

# Natural-log transform of the positive-only values (zeros were already filtered out, so the log is defined everywhere)
log_electricity_gen = np.log(filtered_electricity_gen)

# Plotting Log Transformed Values on a dedicated Axes
fig, ax = plt.subplots(figsize = (8, 6))
sns.histplot(log_electricity_gen, kde = True, color = "orange", ax = ax)
ax.set_title("Log-Transformed Distribution of Electricity Generation (Non-Zero Values)")
ax.set_xlabel("Log of Electricity Generation")
ax.set_ylabel("Frequency")
plt.show()

Distribution of carbon_intensity_elec

# Checking Missing & Zero Values in carbon_intensity_elec (this column was not zero-filled, so NaNs remain)
carbon_intensity_column = relevant_df['carbon_intensity_elec']
missing_count = carbon_intensity_column.isnull().sum()
zero_count = (carbon_intensity_column == 0).sum()
print(f"Missing Values : {missing_count}")
print(f"Zero Values : {zero_count}")
Missing Values : 16811
Zero Values : 24
# Filtering Missing & Zero Values: the `> 0` comparison is False for NaN, so one mask removes both
carbon_intensity_series = relevant_df['carbon_intensity_elec']
filtered_carbon_intensity = carbon_intensity_series[carbon_intensity_series > 0]

# Plotting: histogram with a KDE overlay on a dedicated Axes
fig, ax = plt.subplots(figsize = (8, 6))
sns.histplot(filtered_carbon_intensity, kde = True, color = "slategray", ax = ax)
ax.set_title("Distribution of Carbon Intensity of Electricity")
ax.set_xlabel("Carbon Intensity of Electricity")
ax.set_ylabel("Frequency")
plt.show()

Correlation Matrix

# Identifying columns that are in both numeric_cols and relevant_df.
# Iterating over numeric_cols keeps the columns in the order they have in energy_df.
relevant_column_names = set(relevant_df.columns)
numeric_cols_in_relevant_df = [name for name in numeric_cols if name in relevant_column_names]

# Pairwise correlation between the selected numeric columns
numeric_relevant_df = relevant_df[numeric_cols_in_relevant_df]
correlation_matrix = numeric_relevant_df.corr()

# Plotting: diverging colormap centred on zero so positive / negative correlations are visually distinct
fig, ax = plt.subplots(figsize = (8, 6))
sns.heatmap(
    correlation_matrix,
    annot = False,
    cmap = "coolwarm",
    center = 0,
    square = True,
    linewidths = 0.5,
    ax = ax,
)
ax.set_title("Correlation Matrix of Numerical Features")
plt.show()

Observations from the Heatmap

  1. There are strong correlations among different types of energy consumption and production metrics, as expected. For example, fossil fuel consumption is highly correlated with total primary energy consumption and electricity generation.

  2. Renewable energy consumption shows a positive correlation with low carbon consumption and electricity generation, indicating that countries with higher renewable energy use also tend to have higher overall low carbon energy usage.

  3. Carbon intensity of electricity has correlations with several types of energy consumption, which could inform clustering decisions based on environmental impact considerations.

2 - Clustering Methods Implementation and Analysis

Task 2 - Feature Selection and Data Preparation (2 Cr.)

# Step 1: Feature Selection - Focus on a mix of Consumption, Production, and Environmental Impact
features_for_clustering = [
    'biofuel_consumption', 'coal_consumption', 'gas_consumption', 'oil_consumption',
    'renewables_consumption', 'nuclear_consumption', 'fossil_fuel_consumption',
    'low_carbon_consumption', 'electricity_generation', 'primary_energy_consumption',
    'carbon_intensity_elec'
]

# Rows without a carbon_intensity_elec value are dropped; gaps in the other features are imputed below.
# Note: this reads from energy_df (raw NaNs), not from the zero-filled relevant_df.
clustering_df = energy_df[features_for_clustering].dropna(subset = ['carbon_intensity_elec'])

# Imputing Missing Values in remaining columns using Mean Strategy.
# fit_transform returns a bare array, so the original row index is passed back in explicitly;
# this keeps every row traceable to its country / year in energy_df after clustering.
imputer = SimpleImputer(strategy = 'mean')
clustering_df_imputed = pd.DataFrame(
    imputer.fit_transform(clustering_df),
    columns = clustering_df.columns,
    index = clustering_df.index,
)

# Performing Z-Scale Normalization (zero mean, unit variance per feature) so that
# no single large-magnitude feature dominates the distance computations.
scaler = StandardScaler()
scaled_clustering_df = pd.DataFrame(
    scaler.fit_transform(clustering_df_imputed),
    columns = clustering_df_imputed.columns,
    index = clustering_df_imputed.index,
)

# Checking to see if Data is Ready for Clustering
scaled_clustering_df.isnull().sum()
biofuel_consumption           0
coal_consumption              0
gas_consumption               0
oil_consumption               0
renewables_consumption        0
nuclear_consumption           0
fossil_fuel_consumption       0
low_carbon_consumption        0
electricity_generation        0
primary_energy_consumption    0
carbon_intensity_elec         0
dtype: int64

Task 3 - KMeans Clustering (4 Cr.)

# Within-Cluster Sum of Squares (KMeans inertia) for every candidate cluster count from 1 to 10
candidate_cluster_counts = range(1, 11)
wcss = []
for n_clusters in candidate_cluster_counts:
    # Fixed random_state keeps the centroid initialisation, and hence the curve, reproducible
    kmeans = KMeans(n_clusters = n_clusters, init = 'k-means++', random_state = 42).fit(scaled_clustering_df)
    wcss.append(kmeans.inertia_)

# Plotting the Elbow Graph: the "elbow" is where adding clusters stops reducing WCSS substantially
fig, ax = plt.subplots(figsize = (8, 6))
ax.plot(candidate_cluster_counts, wcss, marker = 'o', linestyle = '--')
ax.set_title('Elbow Method for Optimal K')
ax.set_xlabel('Number of Clusters')
ax.set_ylabel('WCSS')
plt.show()

# Applying K-Means with Cluster Count 3
optimal_k = 3
kmeans = KMeans(n_clusters = optimal_k, init = 'k-means++', random_state = 42)

# Fit on the scaled features, then attach the labels to the unscaled (imputed) frame
# so the plot below shows the features in their original units.
cluster_labels = kmeans.fit_predict(scaled_clustering_df)
clustering_df_imputed['Cluster'] = cluster_labels

# Plotting the Clusters: pairwise scatter plots of every feature pair, coloured by cluster
sns.pairplot(
    clustering_df_imputed,
    hue = 'Cluster',
    palette = 'rainbow',
    corner = True,
)
plt.suptitle('KMeans Clustering Results', fontsize = 25)
plt.show()

Task 4 - Hierarchical Clustering (2 Cr.)

# Performing Hierarchical Clustering on the scaled features using Ward linkage
linked = linkage(scaled_clustering_df, method = 'ward')

# Plotting a Dendrogram for Cluster Hierarchy.
# no_labels = True suppresses every leaf label, so the labels / leaf_rotation / leaf_font_size
# arguments would have no effect and are not passed.
plt.figure(figsize = (8, 6))
dendrogram(linked, no_labels = True)
plt.title('Dendrogram for Hierarchical Clustering')
plt.xlabel('Samples')
plt.ylabel('Euclidean Distances')
plt.xticks([])
plt.figtext(0.5, -0.05, 'NOTE: The sample ticks were removed to optimize performance and reduce clutter on screen.', wrap = True, horizontalalignment = 'center', fontsize = 5)
plt.show()

3 - Declaration of Independent Work

See HOMEPAGE for details